// this script imports and combines the components of the distributional data

do "${dodir}/make_globals.do"

////////////////////////////////////////////////////////////////////////////////
// step 1: import the distributional data on ...
////////////////////////////////////////////////////////////////////////////////

// a. ... workers
// Read the worker-group delimited file, keep a single row per payer_id, and
// store the result as a Stata data set under the same base name.

local worker_groups "$datadir_dist/ppp_worker_groups_2020"

import delimited "`worker_groups'", clear

	// force: rows sharing a payer_id may differ in other columns; only the
	// first occurrence of each payer_id is kept
	duplicates drop payer_id, force

save "`worker_groups'", replace


// b. ... owners

// Convert each owner-group delimited file (C corp, partnership, S corp) into
// a Stata data set with the same base name.
foreach ent in ccorp pshp scorp {
	import delimited "$datadir_dist/ppp_`ent'_owner_groups_2020", clear
	save "$datadir_dist/ppp_`ent'_owner_groups_2020", replace
}

// Stack the three owner files and tag every row with the file it came from:
// ent_type 1 = C corp, 2 = partnership, 3 = S corp.
clear
local ent_code = 0
foreach ent in ccorp pshp scorp {
	local ++ent_code
	append using "$datadir_dist/ppp_`ent'_owner_groups_2020"
	if `ent_code' == 1 {
		gen ent_type = 1
	}
	else {
		// rows just appended are the only ones with ent_type still missing
		replace ent_type = `ent_code' if ent_type == .
	}
}

// Resolve payer_ids that appear more than once, within a file or across the
// three files: keep the row with the largest sum of p20 ... p100.
// all_p is missing when any component is missing; a descending gsort places
// missing values last, so such rows only survive when no complete row exists.
gen all_p = p20 + p40 + p60 + p80 + p90 + p95 + p99 + p100

// gsort does not define the order of tied observations, so without a
// tie-breaker the surviving row could differ from run to run. Breaking ties
// on the append order (C corp, then partnership, then S corp, each in file
// order) makes the result reproducible.
tempvar append_order
gen long `append_order' = _n
gsort -all_p +`append_order'

duplicates drop payer_id, force
drop all_p `append_order'

// payer_id is unique at this point, so no whole-row duplicates can remain
save "$datadir_dist/ppp_owner_groups_2020", replace

// c. ... schedule C

// Read the Schedule C group file and reduce it to one row per id, keeping the
// row with the highest pgroup.
import delimited "$datadir_dist/ppp_schedc_groups_2020", clear

	// remove rows that are identical in every column
	duplicates drop

	// gsort does not define the order of tied observations; break ties on the
	// current row order so the row kept for each id is reproducible
	tempvar row_order
	gen long `row_order' = _n
	gsort -pgroup +`row_order'

	// force: remaining rows for an id differ in some column; keep the first
	duplicates drop id, force
	drop `row_order'

save "$datadir_dist/ppp_schedc_groups_2020", replace

// Build the Schedule C loan-ownership file. sch_c_firm holds the loannumber to
// tax-id matches; ppp_schedc_groups_2020 places each tax record in the income
// distribution (pgroup). Only ids present in both files are kept.
use "$datadir/match/unique_c/sch_c_firm", clear

merge m:1 id using "$datadir_dist/ppp_schedc_groups_2020", nogen keep(matched)

// Set the ownership share to 100% (o_p = 1) for the group the owner is in.
// o_r is supposed to be the tax rate, but that is not available, so it is
// left missing.
gen o_p = 1

gen o_r = .

reshape wide o_p o_r, i(loannumber id) j(pgroup)

// reshape wide only creates o_p<j> for pgroup values that occur in the data.
// Create any absent column first so this step does not stop with
// "variable not found", then set the share to 0 for every group the owner is
// not in.
foreach grp in 20 40 60 80 90 95 99 100 {
	capture confirm variable o_p`grp', exact
	if _rc {
		gen o_p`grp' = 0
	}
	replace o_p`grp' = 0 if o_p`grp' == .
}

// ent_type 4 marks Schedule C records
gen ent_type = 4

// an id can be matched to more than one loannumber; keep one row per id
duplicates drop id, force

save "$datadir_dist/ppp_schedc_groups_2020", replace

// d. UI
// Read the UI group delimited file, keep a single row per payer_id, and store
// the result as a Stata data set under the same base name.

local ui_groups "$datadir_dist/ppp_ui_groups_2020"

import delimited "`ui_groups'", clear

// force: only the first occurrence of each payer_id is kept
duplicates drop payer_id, force

save "`ui_groups'", replace

////////////////////////////////////////////////////////////////////////////////
// step 2. merge them together
////////////////////////////////////////////////////////////////////////////////

// Start from the owner data and prefix its p*/r* columns with o_ so they do
// not collide with the same-named worker and UI columns merged on below.
use "$datadir_dist/ppp_owner_groups_2020", clear

foreach stub in p r {
	rename `stub'?? o_`stub'??
	rename `stub'100 o_`stub'100
}

// Merge on the worker file (prefix w_) and then the UI file (prefix u_),
// prefixing the freshly merged c*/p*/r* columns right after each merge.
forvalues k = 1/2 {
	local src : word `k' of worker ui
	local pre : word `k' of w u

	merge 1:1 payer_id using "$datadir_dist/ppp_`src'_groups_2020", nogen update

	rename c* `pre'_c*
	foreach stub in p r {
		rename `stub'?? `pre'_`stub'??
		rename `stub'100 `pre'_`stub'100
	}
}

gen tax_yr = 2020

// the files merged on next are keyed on id rather than payer_id
rename payer_id id

// Merge on the PPP information
merge 1:1 id using "$datadir/ppp_wide_dist", nogen update

// Merge on the schedule c file. c_match stores the merge result; rows that
// exist only in the schedule c file are not kept.
merge 1:1 id using "$datadir_dist/ppp_schedc_groups_2020", ///
	update generate(c_match) ///
	keep(master match match_update match_conflict)

// Where a worker value exists but the UI value is missing, set the UI value
// to 0 (c* columns first, then p* columns).
foreach stub in c p {
	foreach grp in 20 40 60 80 90 95 99 100 {
		replace u_`stub'`grp' = 0 if u_`stub'`grp' == . & w_`stub'`grp' != .
	}
}

// Missing worker and owner r* values become 0.
// NOTE(review): the u_r* columns are not filled here - confirm that is intended.
foreach grp in 20 40 60 80 90 95 99 100 {
	foreach src in w o {
		replace `src'_r`grp' = 0 if `src'_r`grp' == .
	}
}

save "$datadir/distributional_data", replace
